# Mount Google Drive so the pickled expression matrix and metadata can be read.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Install pinned versions of SHAP and UMAP for reproducibility (Colab shell magic).
!pip install shap==0.43.0 umap-learn==0.5.4
Requirement already satisfied: shap==0.43.0 in /usr/local/lib/python3.10/dist-packages (0.43.0) Requirement already satisfied: umap-learn==0.5.4 in /usr/local/lib/python3.10/dist-packages (0.5.4) Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (1.23.5) Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (1.11.3) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (1.2.2) Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (1.5.3) Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (4.66.1) Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (23.2) Requirement already satisfied: slicer==0.0.7 in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (0.0.7) Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (0.56.4) Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from shap==0.43.0) (2.2.1) Requirement already satisfied: pynndescent>=0.5 in /usr/local/lib/python3.10/dist-packages (from umap-learn==0.5.4) (0.5.10) Requirement already satisfied: tbb>=2019.0 in /usr/local/lib/python3.10/dist-packages (from umap-learn==0.5.4) (2021.10.0) Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->shap==0.43.0) (0.39.1) Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from numba->shap==0.43.0) (67.7.2) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.10/dist-packages (from pynndescent>=0.5->umap-learn==0.5.4) (1.3.2) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap==0.43.0) (3.2.0) 
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap==0.43.0) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap==0.43.0) (2023.3.post1) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->shap==0.43.0) (1.16.0)
import shap
import pandas as pd
import numpy as np
# Gene-expression count matrix: 11093 samples x 39979 genes, indexed by sample UUID.
# NOTE(review): the path genuinely contains "Gene parsing code " with a trailing
# space before the slash — do not "fix" it.
df = pd.read_pickle('/content/drive/MyDrive/pan_cancer_diner/Gene parsing code /11093rx39979c.pkl')
# Per-sample TCGA clinical metadata: vital status, tumor flag, panel, stage, etc.
meta= pd.read_pickle('/content/drive/MyDrive/pan_cancer_diner/Clinical data from tcga.R/metadata.pkl')
# Inspect the available clinical columns
meta.columns
Index(['Unnamed: 0', 'data_type', 'updated_datetime', 'file_name', 'md5sum',
'data_category', 'experimental_strategy', 'project', 'sample_uuid',
'sample_barcode', 'tumor', 'patient_barcode', 'gender', 'vital',
'days_to_contact', 'days_to_death', 'days_to_birth', 'panel',
'histology', 'tissue_site', 'stage', 'T', 'N', 'M', 'residual_tumor',
'new_tumor_events', 'follow_ups'],
dtype='object')
# Peek at the expression matrix (11093 rows x 39979 gene columns)
df
| 5S_rRNA | 5_8S_rRNA | 7SK | A1BG | A1BG-AS1 | A1CF | A2M | A2M-AS1 | A2ML1 | A2ML1-AS1 | ... | ZYG11A | ZYG11AP1 | ZYG11B | ZYX | ZYXP1 | ZZEF1 | ZZZ3 | hsa-mir-1253 | hsa-mir-423 | snoZ196 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | |||||||||||||||||||||
| 84fd87d4-9b47-4852-b45c-1f681a58832c | 4 | 0 | 949 | 48 | 74 | 2 | 27981 | 15 | 288 | 0 | ... | 1 | 0 | 1057 | 4318 | 0 | 973 | 924 | 0 | 0 | 1 |
| 2da36252-af74-4e0c-ae38-5f5fc6bdad6e | 0 | 0 | 285 | 10 | 41 | 4 | 33182 | 80 | 586 | 0 | ... | 0 | 0 | 3719 | 5613 | 0 | 10839 | 1977 | 0 | 0 | 2 |
| 20ef7e72-9b76-4a4f-985c-1c35503b3e86 | 0 | 0 | 82 | 2 | 20 | 0 | 14847 | 115 | 122 | 0 | ... | 185 | 0 | 2622 | 6686 | 0 | 2412 | 2116 | 0 | 0 | 4 |
| 83acd71c-c12a-4394-ba8d-05e9c5ad2cf1 | 1 | 0 | 502 | 44 | 278 | 9 | 50129 | 139 | 26 | 1 | ... | 531 | 0 | 6497 | 9902 | 0 | 7520 | 6272 | 0 | 0 | 0 |
| 04d7ee0b-95ce-4af2-8140-547e9c6bd187 | 2 | 0 | 612 | 6 | 84 | 3 | 31641 | 55 | 518 | 0 | ... | 8 | 0 | 2446 | 11845 | 0 | 4063 | 2797 | 0 | 0 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5d98ad4a-483f-4032-ae9e-0ab6d398da98 | 3 | 0 | 3455 | 81 | 216 | 1 | 23 | 4 | 10 | 0 | ... | 2 | 0 | 907 | 28542 | 0 | 3221 | 1042 | 0 | 0 | 8 |
| e0822da0-24d1-413a-b87f-5c6a242d3732 | 0 | 0 | 254 | 29 | 192 | 9 | 53226 | 74 | 38 | 1 | ... | 126 | 0 | 2741 | 11978 | 0 | 5163 | 5026 | 0 | 0 | 5 |
| 3a74e5d3-48e7-4df9-85e1-dd02d1c804c1 | 1 | 0 | 95 | 135 | 674 | 0 | 41441 | 54 | 12 | 0 | ... | 72 | 0 | 2296 | 27635 | 0 | 5424 | 2489 | 0 | 0 | 6 |
| 86fd69ec-fbe4-4ca7-a1a5-a42367527929 | 1 | 0 | 150 | 20 | 53 | 3 | 7448 | 21 | 12578 | 0 | ... | 356 | 0 | 854 | 7628 | 0 | 1908 | 1095 | 0 | 0 | 3 |
| a811e4bd-d1d5-4a97-bc7f-899906c7f145 | 2 | 0 | 48 | 45 | 232 | 3 | 67745 | 22 | 19 | 0 | ... | 434 | 0 | 2194 | 8696 | 0 | 2316 | 1478 | 0 | 0 | 4 |
11093 rows × 39979 columns
# Keep only tumour samples. The boolean mask is explicitly aligned to df's
# index: the original bare `df[meta['tumor'] == True]` triggered pandas'
# "Boolean Series key will be reindexed" warning (see the logged warning that
# followed this cell), and the chained assignment raised SettingWithCopyWarning.
# `.copy()` makes df an independent frame so the column assignment is safe.
df = df[(meta['tumor'] == True).reindex(df.index, fill_value=False)].copy()
# Attach the survival label (index-aligned by sample UUID), then drop any
# samples with missing values.
df['vital'] = meta['vital']
df = df.dropna()
Boolean Series key will be reindexed to match DataFrame index. A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Confirm the label takes exactly two values: 'Alive' / 'Dead'
df['vital'].unique()
array(['Alive', 'Dead'], dtype=object)
# Encode survival as a binary target: Alive -> 1, Dead -> 0
y = df['vital'].replace({'Alive': 1, 'Dead': 0})
# Show the encoded series (indexed by sample UUID)
y
Unnamed: 0
84fd87d4-9b47-4852-b45c-1f681a58832c 1
2da36252-af74-4e0c-ae38-5f5fc6bdad6e 1
83acd71c-c12a-4394-ba8d-05e9c5ad2cf1 1
04d7ee0b-95ce-4af2-8140-547e9c6bd187 1
171f45e7-83e7-4286-970a-0dafa6f46d8a 1
..
21ea6561-2bd1-4a3d-ae70-3c504530bb26 0
5d98ad4a-483f-4032-ae9e-0ab6d398da98 0
3a74e5d3-48e7-4df9-85e1-dd02d1c804c1 1
86fd69ec-fbe4-4ca7-a1a5-a42367527929 0
a811e4bd-d1d5-4a97-bc7f-899906c7f145 1
Name: vital, Length: 10339, dtype: int64
# Class balance: ~70% alive vs ~30% dead (moderately imbalanced)
y.value_counts()
1 7251 0 3088 Name: vital, dtype: int64
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
# Feature matrix: log-transformed raw counts, excluding the last column ("vital").
# np.log1p(x) computes log(x + 1) with better numerical accuracy near zero.
X = np.log1p(df.iloc[:, :-1].values)
# Binary target: 1 = Alive, 0 = Dead
y = df['vital'].replace({'Alive': 1, 'Dead': 0})
y = y.values
# Cross-validation setup: stratified folds preserve the ~70/30 class balance.
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
aucs = []
aucs = []
# XGBoost classifier, heavily regularised (gamma, L1/L2, aggressive column
# subsampling) because p >> n here (~40k genes vs ~10k samples).
clf = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=20,
    subsample=0.5,
    colsample_bytree=0.2,
    colsample_bylevel=0.2,
    gamma=2,
    reg_alpha=0.5,
    reg_lambda=2,
    eval_metric='auc',
    # NOTE: `use_label_encoder` was removed in XGBoost 2.x (this run uses 2.x,
    # as shown by the `device="cuda"` support); passing it only produced an
    # "unused parameter" warning, so it is dropped.
    grow_policy='lossguide',
    verbosity=3,
    random_state=42,  # fixed seed so the CV results are reproducible
    n_jobs=-1,
    tree_method="hist",
    device="cuda",  # GPU training (confirmed by the updater_gpu_hist logs)
)
"""
clf = XGBClassifier(
n_estimators= 200,
eval_metric='auc',
verbosity=3,
#tree_method='gpu_hist',
random_state=50,
grow_policy='lossguide',
tree_method='gpu_hist',
)
"""
"\nclf = XGBClassifier(\n n_estimators= 200,\n eval_metric='auc',\n verbosity=3,\n #tree_method='gpu_hist',\n random_state=50,\n grow_policy='lossguide',\n tree_method='gpu_hist',\n\n)\n"
StratifiedKFold is a variation of k-fold cross-validation that returns stratified folds. "Stratified" means that each fold is made by preserving the percentage of samples for each class. This is especially useful when you have an imbalanced dataset where one class significantly outnumbers the other(s).
# Create a figure for the per-fold ROC curves
plt.figure(figsize=(10, 8))
from sklearn.metrics import roc_curve, roc_auc_score, classification_report

# Stratified 5-fold CV: train on 4 folds, evaluate AUC/ROC on the held-out fold.
for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)

    # Hard class predictions for the per-fold classification report
    y_pred_class = clf.predict(X_test)
    print(f"Classification Report for Fold {fold}:")
    print(classification_report(y_test, y_pred_class))
    print("-" * 50)  # visual separation between reports

    # Positive-class probabilities for AUC / ROC
    y_pred = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred)
    aucs.append(auc)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    plt.plot(fpr, tpr, label=f'AUC (Fold {fold}): {auc:.2f}')

# Finish and save the ROC plot
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for each fold')
plt.legend(loc="lower right")
# BUG FIX: savefig must run BEFORE plt.show() — show() releases the current
# figure, so the original order saved an empty canvas (the notebook output
# showed "<Figure size 640x480 with 0 Axes>").
plt.savefig("ROC.png", dpi=600)  # dpi adjustable for desired resolution
plt.show()

# Report mean and standard deviation of AUC across folds
print(f"Mean AUC: {np.mean(aucs):.2f}")
print(f"Std AUC: {np.std(aucs):.2f}")
[20:08:39] ======== Monitor (0): HostSketchContainer ======== [20:08:39] AllReduce: 1.31617s, 1 calls @ 1316166us [20:08:39] MakeCuts: 1.44378s, 1 calls @ 1443776us [20:08:39] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3 [20:08:39] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure [20:08:40] ======== Monitor (0): ======== [20:08:40] InitCompressedData: 0.008849s, 1 calls @ 8849us [20:14:48] ======== Monitor (0): Learner ======== [20:14:48] Configure: 0.000931s, 1 calls @ 931us [20:14:48] EvalOneIter: 0.010252s, 1000 calls @ 10252us [20:14:48] GetGradient: 0.091119s, 1000 calls @ 91119us [20:14:48] PredictRaw: 0.001497s, 1000 calls @ 1497us [20:14:48] UpdateOneIter: 368.323s, 1000 calls @ 368323197us [20:14:48] ======== Monitor (0): GBTree ======== [20:14:48] BoostNewTrees: 368.212s, 1000 calls @ 368212163us [20:14:48] CommitModel: 0.000947s, 1000 calls @ 947us [20:14:48] ======== Device 0 Memory Allocations: ======== [20:14:48] Peak memory usage: 11829MiB [20:14:48] Number of allocations: 630133 [20:14:48] ======== Monitor (0): updater_gpu_hist ======== [20:14:48] InitData: 0.001516s, 1000 calls @ 1516us [20:14:48] InitDataOnce: 0.001319s, 1 calls @ 1319us [20:14:48] Update: 368.168s, 1000 calls @ 368168311us [20:14:48] UpdatePredictionCache: 0.033227s, 1000 calls @ 33227us [20:14:48] ======== Monitor (0): gradient_based_sampler ======== [20:14:48] Sample: 0.174361s, 1000 calls @ 174361us [20:14:48] ======== Monitor (0): GPUHistMakerDevice0 ======== [20:14:48] AllReduce: 0.045497s, 129374 calls @ 45497us [20:14:48] BuildHist: 1.00499s, 75124 calls @ 1004988us [20:14:48] EvaluateSplits: 309.249s, 75124 calls @ 309249015us [20:14:48] FinalisePosition: 0.059938s, 1000 calls @ 59938us [20:14:48] InitRoot: 52.242s, 1000 calls @ 52242020us [20:14:48] Reset: 1.66683s, 1000 calls @ 1666827us [20:14:48] UpdatePosition: 3.65072s, 75124 calls @ 3650720us [20:14:48] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3 
[20:14:48] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:14:48] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu. Potential solutions: - Use a data structure that matches the device ordinal in the booster. - Set the device for booster before call to inplace_predict. This warning will only be shown once.
Classification Report for Fold 1:
precision recall f1-score support
0 0.63 0.33 0.43 617
1 0.76 0.92 0.83 1451
accuracy 0.74 2068
macro avg 0.70 0.62 0.63 2068
weighted avg 0.72 0.74 0.71 2068
--------------------------------------------------
[20:15:09] ======== Monitor (0): HostSketchContainer ========
[20:15:09] AllReduce: 1.34397s, 1 calls @ 1343965us
[20:15:09] MakeCuts: 1.48077s, 1 calls @ 1480768us
[20:15:09] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:15:09] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:15:10] ======== Monitor (0): ========
[20:15:10] InitCompressedData: 0.008957s, 1 calls @ 8957us
[20:21:23] ======== Monitor (0): Learner ========
[20:21:23] Configure: 0.000767s, 1 calls @ 767us
[20:21:23] EvalOneIter: 0.010911s, 1000 calls @ 10911us
[20:21:23] GetGradient: 0.092169s, 1000 calls @ 92169us
[20:21:23] PredictRaw: 0.001585s, 1000 calls @ 1585us
[20:21:23] UpdateOneIter: 373.295s, 1000 calls @ 373295404us
[20:21:23] ======== Monitor (0): GBTree ========
[20:21:23] BoostNewTrees: 373.182s, 1000 calls @ 373182309us
[20:21:23] CommitModel: 0.001027s, 1000 calls @ 1027us
[20:21:23] ======== Device 0 Memory Allocations: ========
[20:21:23] Peak memory usage: 12769MiB
[20:21:23] Number of allocations: 1262846
[20:21:23] ======== Monitor (0): updater_gpu_hist ========
[20:21:23] InitData: 0.000863s, 1000 calls @ 863us
[20:21:23] InitDataOnce: 0.000676s, 1 calls @ 676us
[20:21:23] Update: 373.138s, 1000 calls @ 373138015us
[20:21:23] UpdatePredictionCache: 0.032849s, 1000 calls @ 32849us
[20:21:23] ======== Monitor (0): gradient_based_sampler ========
[20:21:23] Sample: 0.175867s, 1000 calls @ 175867us
[20:21:23] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:21:23] AllReduce: 0.045384s, 131640 calls @ 45384us
[20:21:23] BuildHist: 1.00998s, 75644 calls @ 1009982us
[20:21:23] EvaluateSplits: 314.141s, 75644 calls @ 314141292us
[20:21:23] FinalisePosition: 0.063456s, 1000 calls @ 63456us
[20:21:23] InitRoot: 52.2914s, 1000 calls @ 52291443us
[20:21:23] Reset: 1.6703s, 1000 calls @ 1670298us
[20:21:23] UpdatePosition: 3.66174s, 75644 calls @ 3661742us
[20:21:23] ======== Monitor (0): Learner ========
[20:21:23] Configure: 0.000615s, 1 calls @ 615us
[20:21:23] ======== Monitor (0): GBTree ========
[20:21:23] ======== Device 0 Memory Allocations: ========
[20:21:23] Peak memory usage: 12769MiB
[20:21:23] Number of allocations: 1262846
[20:21:23] ======== Monitor (0): updater_gpu_hist ========
[20:21:23] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:21:23] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
Classification Report for Fold 2:
precision recall f1-score support
0 0.68 0.33 0.45 618
1 0.77 0.93 0.84 1450
accuracy 0.75 2068
macro avg 0.72 0.63 0.64 2068
weighted avg 0.74 0.75 0.72 2068
--------------------------------------------------
[20:21:45] ======== Monitor (0): HostSketchContainer ========
[20:21:45] AllReduce: 1.44328s, 1 calls @ 1443277us
[20:21:45] MakeCuts: 1.60879s, 1 calls @ 1608792us
[20:21:45] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:21:45] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:21:45] ======== Monitor (0): ========
[20:21:45] InitCompressedData: 0.008925s, 1 calls @ 8925us
[20:27:54] ======== Monitor (0): Learner ========
[20:27:54] Configure: 0.001262s, 1 calls @ 1262us
[20:27:54] EvalOneIter: 0.010574s, 1000 calls @ 10574us
[20:27:54] GetGradient: 0.093528s, 1000 calls @ 93528us
[20:27:54] PredictRaw: 0.001559s, 1000 calls @ 1559us
[20:27:54] UpdateOneIter: 368.85s, 1000 calls @ 368849926us
[20:27:54] ======== Monitor (0): GBTree ========
[20:27:54] BoostNewTrees: 368.736s, 1000 calls @ 368735585us
[20:27:54] CommitModel: 0.000968s, 1000 calls @ 968us
[20:27:54] ======== Device 0 Memory Allocations: ========
[20:27:54] Peak memory usage: 13694MiB
[20:27:54] Number of allocations: 1892995
[20:27:54] ======== Monitor (0): updater_gpu_hist ========
[20:27:54] InitData: 0.000861s, 1000 calls @ 861us
[20:27:54] InitDataOnce: 0.000675s, 1 calls @ 675us
[20:27:54] Update: 368.692s, 1000 calls @ 368691518us
[20:27:54] UpdatePredictionCache: 0.032845s, 1000 calls @ 32845us
[20:27:54] ======== Monitor (0): gradient_based_sampler ========
[20:27:54] Sample: 0.175475s, 1000 calls @ 175475us
[20:27:54] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:27:54] AllReduce: 0.04477s, 129606 calls @ 44770us
[20:27:54] BuildHist: 1.00104s, 75322 calls @ 1001038us
[20:27:54] EvaluateSplits: 309.799s, 75322 calls @ 309798851us
[20:27:54] FinalisePosition: 0.064922s, 1000 calls @ 64922us
[20:27:54] InitRoot: 52.2193s, 1000 calls @ 52219262us
[20:27:54] Reset: 1.66512s, 1000 calls @ 1665122us
[20:27:54] UpdatePosition: 3.64545s, 75322 calls @ 3645452us
[20:27:54] ======== Monitor (0): Learner ========
[20:27:54] Configure: 0.000954s, 1 calls @ 954us
[20:27:54] ======== Monitor (0): GBTree ========
[20:27:54] ======== Device 0 Memory Allocations: ========
[20:27:54] Peak memory usage: 13694MiB
[20:27:54] Number of allocations: 1892995
[20:27:54] ======== Monitor (0): updater_gpu_hist ========
[20:27:54] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:27:54] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
Classification Report for Fold 3:
precision recall f1-score support
0 0.63 0.37 0.46 618
1 0.77 0.91 0.83 1450
accuracy 0.75 2068
macro avg 0.70 0.64 0.65 2068
weighted avg 0.73 0.75 0.72 2068
--------------------------------------------------
[20:28:13] ======== Monitor (0): HostSketchContainer ========
[20:28:13] AllReduce: 1.25285s, 1 calls @ 1252853us
[20:28:13] MakeCuts: 1.36541s, 1 calls @ 1365414us
[20:28:13] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:28:13] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:28:14] ======== Monitor (0): ========
[20:28:14] InitCompressedData: 0.008855s, 1 calls @ 8855us
[20:34:20] ======== Monitor (0): Learner ========
[20:34:20] Configure: 0.001267s, 1 calls @ 1267us
[20:34:20] EvalOneIter: 0.010215s, 1000 calls @ 10215us
[20:34:20] GetGradient: 0.093642s, 1000 calls @ 93642us
[20:34:20] PredictRaw: 0.001559s, 1000 calls @ 1559us
[20:34:20] UpdateOneIter: 366.402s, 1000 calls @ 366401654us
[20:34:20] ======== Monitor (0): GBTree ========
[20:34:20] BoostNewTrees: 366.287s, 1000 calls @ 366286987us
[20:34:20] CommitModel: 0.000892s, 1000 calls @ 892us
[20:34:20] ======== Device 0 Memory Allocations: ========
[20:34:20] Peak memory usage: 14620MiB
[20:34:20] Number of allocations: 2518290
[20:34:20] ======== Monitor (0): updater_gpu_hist ========
[20:34:20] InitData: 0.000799s, 1000 calls @ 799us
[20:34:20] InitDataOnce: 0.000628s, 1 calls @ 628us
[20:34:20] Update: 366.243s, 1000 calls @ 366243243us
[20:34:20] UpdatePredictionCache: 0.032925s, 1000 calls @ 32925us
[20:34:20] ======== Monitor (0): gradient_based_sampler ========
[20:34:20] Sample: 0.17309s, 1000 calls @ 173090us
[20:34:20] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:34:20] AllReduce: 0.044932s, 128490 calls @ 44932us
[20:34:20] BuildHist: 0.998775s, 74739 calls @ 998775us
[20:34:20] EvaluateSplits: 307.39s, 74739 calls @ 307390007us
[20:34:20] FinalisePosition: 0.064006s, 1000 calls @ 64006us
[20:34:20] InitRoot: 52.2003s, 1000 calls @ 52200279us
[20:34:20] Reset: 1.65857s, 1000 calls @ 1658569us
[20:34:20] UpdatePosition: 3.63283s, 74739 calls @ 3632833us
[20:34:20] ======== Monitor (0): Learner ========
[20:34:20] Configure: 0.000771s, 1 calls @ 771us
[20:34:20] ======== Monitor (0): GBTree ========
[20:34:20] ======== Device 0 Memory Allocations: ========
[20:34:20] Peak memory usage: 14620MiB
[20:34:20] Number of allocations: 2518290
[20:34:20] ======== Monitor (0): updater_gpu_hist ========
[20:34:20] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:34:20] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
Classification Report for Fold 4:
precision recall f1-score support
0 0.65 0.37 0.47 618
1 0.77 0.92 0.84 1450
accuracy 0.75 2068
macro avg 0.71 0.64 0.65 2068
weighted avg 0.74 0.75 0.73 2068
--------------------------------------------------
[20:34:42] ======== Monitor (0): HostSketchContainer ========
[20:34:42] AllReduce: 1.20227s, 1 calls @ 1202273us
[20:34:42] MakeCuts: 1.33984s, 1 calls @ 1339839us
[20:34:42] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:34:42] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
[20:34:42] ======== Monitor (0): ========
[20:34:42] InitCompressedData: 0.008832s, 1 calls @ 8832us
[20:40:52] ======== Monitor (0): Learner ========
[20:40:52] Configure: 0.001375s, 1 calls @ 1375us
[20:40:52] EvalOneIter: 0.010712s, 1000 calls @ 10712us
[20:40:52] GetGradient: 0.092329s, 1000 calls @ 92329us
[20:40:52] PredictRaw: 0.001562s, 1000 calls @ 1562us
[20:40:52] UpdateOneIter: 370.296s, 1000 calls @ 370295978us
[20:40:52] ======== Monitor (0): GBTree ========
[20:40:52] BoostNewTrees: 370.182s, 1000 calls @ 370182110us
[20:40:52] CommitModel: 0.001003s, 1000 calls @ 1003us
[20:40:52] ======== Device 0 Memory Allocations: ========
[20:40:52] Peak memory usage: 15553MiB
[20:40:52] Number of allocations: 3147564
[20:40:52] ======== Monitor (0): updater_gpu_hist ========
[20:40:52] InitData: 0.000786s, 1000 calls @ 786us
[20:40:52] InitDataOnce: 0.000619s, 1 calls @ 619us
[20:40:52] Update: 370.138s, 1000 calls @ 370137965us
[20:40:52] UpdatePredictionCache: 0.032805s, 1000 calls @ 32805us
[20:40:52] ======== Monitor (0): gradient_based_sampler ========
[20:40:52] Sample: 0.173104s, 1000 calls @ 173104us
[20:40:52] ======== Monitor (0): GPUHistMakerDevice0 ========
[20:40:52] AllReduce: 0.045781s, 130849 calls @ 45781us
[20:40:52] BuildHist: 1.01018s, 75235 calls @ 1010177us
[20:40:52] EvaluateSplits: 311.165s, 75235 calls @ 311164990us
[20:40:52] FinalisePosition: 0.066201s, 1000 calls @ 66201us
[20:40:52] InitRoot: 52.271s, 1000 calls @ 52270975us
[20:40:52] Reset: 1.66845s, 1000 calls @ 1668446us
[20:40:52] UpdatePosition: 3.65671s, 75235 calls @ 3656707us
[20:40:52] ======== Monitor (0): Learner ========
[20:40:52] Configure: 0.000789s, 1 calls @ 789us
[20:40:52] ======== Monitor (0): GBTree ========
[20:40:52] ======== Device 0 Memory Allocations: ========
[20:40:52] Peak memory usage: 15553MiB
[20:40:52] Number of allocations: 3147564
[20:40:52] ======== Monitor (0): updater_gpu_hist ========
[20:40:52] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3
[20:40:52] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure
Classification Report for Fold 5:
precision recall f1-score support
0 0.67 0.32 0.43 617
1 0.76 0.93 0.84 1450
accuracy 0.75 2067
macro avg 0.72 0.63 0.64 2067
weighted avg 0.73 0.75 0.72 2067
--------------------------------------------------
Mean AUC: 0.78 Std AUC: 0.01
<Figure size 640x480 with 0 Axes>
# Display the classifier's resolved hyper-parameters (notebook repr)
clf
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=0.2, colsample_bynode=None,
colsample_bytree=0.2, device='cuda', early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc', feature_types=None,
gamma=2, grow_policy='lossguide', importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=20, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=1000, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=0.2, colsample_bynode=None,
colsample_bytree=0.2, device='cuda', early_stopping_rounds=None,
enable_categorical=False, eval_metric='auc', feature_types=None,
gamma=2, grow_policy='lossguide', importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=20, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=1000, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...)WITH ALL DATA SHAPS
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
# Rebuild X / y with the same transform as the CV section: log-scaled counts,
# excluding the "vital" label column. np.log1p(x) == log(x + 1), numerically
# more accurate near zero.
X = np.log1p(df.iloc[:, :-1].values)
y = df['vital'].replace({'Alive': 1, 'Dead': 0})
y = y.values
"""
clf = XGBClassifier(
n_estimators= 100,
eval_metric='auc',
verbosity=3,
#tree_method='gpu_hist',
random_state=50,
grow_policy='lossguide',
tree_method='gpu_hist',
)
"""
"\nclf = XGBClassifier(\n n_estimators= 100,\n eval_metric='auc',\n verbosity=3,\n #tree_method='gpu_hist',\n random_state=50,\n grow_policy='lossguide',\n tree_method='gpu_hist',\n\n)\n"
# Refit the model on ALL samples and build a SHAP TreeExplainer from it.
# NOTE: the original comment claimed this used "the last test set after the
# final cross-validation loop", but the explainer is actually built from a
# model refit on the full data set (X, y).
clf.fit(X, y)
explainer = shap.TreeExplainer(clf)
[20:41:28] ======== Monitor (0): HostSketchContainer ======== [20:41:28] AllReduce: 1.49339s, 1 calls @ 1493388us [20:41:28] MakeCuts: 1.62093s, 1 calls @ 1620927us [20:41:28] DEBUG: /workspace/src/gbm/gbtree.cc:130: Using tree method: 3 [20:41:28] DEBUG: /workspace/src/tree/updater_gpu_hist.cu:744: [GPU Hist]: Configure [20:41:28] ======== Monitor (0): ======== [20:41:28] InitCompressedData: 0.011051s, 1 calls @ 11051us [20:49:27] ======== Monitor (0): Learner ======== [20:49:27] Configure: 0.000785s, 1 calls @ 785us [20:49:27] EvalOneIter: 0.011364s, 1000 calls @ 11364us [20:49:27] GetGradient: 0.109481s, 1000 calls @ 109481us [20:49:27] PredictRaw: 0.001669s, 1000 calls @ 1669us [20:49:27] UpdateOneIter: 479.114s, 1000 calls @ 479114061us [20:49:27] ======== Monitor (0): GBTree ======== [20:49:27] BoostNewTrees: 478.983s, 1000 calls @ 478983042us [20:49:27] CommitModel: 0.001046s, 1000 calls @ 1046us [20:49:27] ======== Device 0 Memory Allocations: ======== [20:49:27] Peak memory usage: 16741MiB [20:49:27] Number of allocations: 3926647 [20:49:27] ======== Monitor (0): updater_gpu_hist ======== [20:49:27] InitData: 0.000799s, 1000 calls @ 799us [20:49:27] InitDataOnce: 0.000618s, 1 calls @ 618us [20:49:27] Update: 478.936s, 1000 calls @ 478935552us [20:49:27] UpdatePredictionCache: 0.03559s, 1000 calls @ 35590us [20:49:27] ======== Monitor (0): gradient_based_sampler ======== [20:49:27] Sample: 0.211724s, 1000 calls @ 211724us [20:49:27] ======== Monitor (0): GPUHistMakerDevice0 ======== [20:49:27] AllReduce: 0.056498s, 167249 calls @ 56498us [20:49:27] BuildHist: 1.24579s, 93894 calls @ 1245793us [20:49:27] EvaluateSplits: 409.731s, 93894 calls @ 409730747us [20:49:27] FinalisePosition: 0.07003s, 1000 calls @ 70030us [20:49:27] InitRoot: 61.2494s, 1000 calls @ 61249417us [20:49:27] Reset: 1.72209s, 1000 calls @ 1722089us [20:49:27] UpdatePosition: 4.54865s, 93894 calls @ 4548653us [20:49:27] ======== Monitor (0): Learner ======== [20:49:27] Configure: 
0.001535s, 1 calls @ 1535us [20:49:27] ======== Monitor (0): GBTree ======== [20:49:27] ======== Device 0 Memory Allocations: ======== [20:49:27] Peak memory usage: 16741MiB [20:49:27] Number of allocations: 3926647 [20:49:27] ======== Monitor (0): updater_gpu_hist ========
[20:49:27] WARNING: /workspace/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
# Per-sample, per-gene SHAP values for the full-data model
shap_values = explainer.shap_values(X)
# Global importance bar chart for the top 100 genes
shap.summary_plot(shap_values, df.iloc[:,:-1], plot_type="bar", max_display=100)
import matplotlib
#matplotlib.use('Agg')
import shap
import matplotlib.pyplot as plt

# Set up a figure
fig, ax = plt.subplots(figsize=(10, 6))
# BUG FIX: shap.summary_plot shows (and releases) the figure by default, so
# the subsequent savefig() wrote an empty file (the notebook output showed
# "<Figure size 640x480 with 0 Axes>"). show=False keeps the figure alive
# until it has been laid out and saved.
shap.summary_plot(shap_values, df.iloc[:, :-1], plot_type="bar",
                  max_display=30, show=False)
plt.tight_layout()
plt.savefig("shap_summary_plot_global.png", dpi=600)  # dpi adjustable
plt.show()
<Figure size 640x480 with 0 Axes>
import numpy as np
import pandas as pd
import plotly.express as px

# SHAP values as a DataFrame (rows = samples, columns = genes), with each
# sample's cancer panel attached from `meta` (index-aligned by sample UUID).
shap_df = pd.DataFrame(shap_values, columns=df.columns[:-1], index=df.index)
shap_df["panel"] = meta['panel']

# Global importance per gene: mean absolute SHAP value (panel column excluded).
shap_importances = shap_df.iloc[:, :-1].abs().mean(axis=0)

top = 50
# Positional indices of the `top` most important genes (ascending importance).
top_feature_indices = np.argsort(shap_importances)[-top:]
top_features = shap_df.columns[top_feature_indices]

# Long ("melted") format — one row per (sample, gene) — for the grouped box plot.
melted_shap_df = pd.melt(shap_df, id_vars=["panel"], value_vars=top_features,
                         var_name="variable")

# Order panels alphabetically so plot groups match the legend labels.
sorted_panels = sorted(melted_shap_df['panel'].unique())
melted_shap_df['panel'] = pd.Categorical(melted_shap_df['panel'],
                                         categories=sorted_panels, ordered=True)
# Sort by feature, then panel, so traces are drawn in a stable order.
melted_shap_df = melted_shap_df.sort_values(by=['variable', 'panel'])

# Horizontal grouped box plot of SHAP values per gene, coloured by panel.
fig = px.box(
    melted_shap_df,
    x="value",
    y="variable",
    color="panel",
    title="Top 50 SHAP Values by Panel",
    labels={"value": "SHAP Value", "variable": "Feature", "panel": "Panel"},
    orientation="h",
    category_orders={"variable": top_features.tolist()},
)

# Dashed vertical reference line at SHAP = 0.
fig.add_shape(
    type='line',
    line=dict(dash='dash'),
    x0=0,
    x1=0,
    y0=-0.5,
    y1=top - 0.5,
)
fig.update_layout(
    xaxis_title="SHAP Value",
    yaxis_title="Feature",
    boxmode='group',
    margin=dict(l=0, r=0, t=30, b=0),
)
fig.show()
# Save the interactive figure as a standalone HTML file
fig.write_html("shap_summary_plotly.html")